#' ---
#' title: "Greenwashing the Future? Computational Text Analysis of Environmental Reporting from the Fossil Fuel Industry"
#' subtitle: "04_analysis.R"
#' author: "Robin Rauner"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---

# load packages
library(tidyverse)          # CRAN v2.0.0
library(rio)                # CRAN v1.2.2
library(ggthemes)           # CRAN v5.1.0
library(gridExtra)          # CRAN v2.3
library(quanteda)           # CRAN v4.1.0
library(quanteda.textstats) # CRAN v0.97.2
library(quanteda.textplots) # CRAN v0.95
library(texreg)             # CRAN v1.39.4
library(ggrepel)            # CRAN v0.9.6
library(xtable)             # CRAN v1.8.4

sessionInfo()

# load data
dat_fls_llm <- import("llm_annotated_data.csv"
) |>
    rename(doc_id = speech_id, text = speech_text)

dim(dat_fls_llm)
names(dat_fls_llm)

dat_fls_scores <- import("fls_scores_llm.csv") |>
    select(-V1)
names(dat_fls_scores)
nrow(dat_fls_scores)

dat_report_stats <- import("report_stats.csv")
names(dat_report_stats)

dat_report_stats <- dat_report_stats |>
    mutate(name = recode(name,
        "Ultrapar Participações" = "Ultrapar Participacoes",
        "Koç Holding" = "Koc Holding"
    ))

dat_nzt <- import(
    "nzt_data.xlsx",
    col_types = "text"
)

dat_disclaimer <- import("new_docvars.csv") |>
    mutate(name = recode(name,
        "Ultrapar Participações" = "Ultrapar Participacoes",
        "Koç Holding" = "Koc Holding"
    ))
names(dat_disclaimer)

# Standardize spellings with special characters
dat_nzt <- dat_nzt |>
    mutate(name = recode(name,
        "Ultrapar Participações" = "Ultrapar Participacoes",
        "Koç Holding" = "Koc Holding"
    ))

dat_tpi <- import("Company_Latest_Assessments_5.0.csv")

# Remove duplicates and irrelevant sectors, standardize company names
dat_tpi <- dat_tpi |>
    filter(!`Company Name` %in% c("Origin Energy (Electricity Utility)", "Teck Resources (Diversified mining)") & Sector %in% c("Electricity Utilities", "Oil & Gas", "Utilities", "Oil & Gas Distribution", "Oil & Gas (Other)", "Coal Mining")) |>
        mutate(`Company Name` = recode(`Company Name`,
            "China Coal" = "China Coal Energy",
            "China Shenhua Energy" = "China Shenhua Energy Co",
            "NovaTek" = "Novatek OAO",
            "Petrochina" = "PetroChina Co",
            "PTT Exploration & Production" = "PTT Exploration & Production Public Company",
            "Rosneft Oil" = "Rosneft",
            "Sinopec China Petroleum & Chemical" = "Sinopec",
            "Shell" = "Royal Dutch Shell", 
            "Tokyo Gas" = "Tokyo Gas Co",
            "TATNEFT" = "Tatneft"
        ))

# Merge data
dat_merged <- dat_fls_scores |>
    left_join(dat_nzt, by = c("company" = "name")) |>
    left_join(dat_disclaimer, by = c("company" = "name")) |>
    left_join(dat_tpi |> select(ISINs, Sector), by = c("isin_id" = "ISINs")) |>
    left_join(dat_tpi |> select(`Company Name`, Sector), by = c("company" = "Company Name")) |>
    distinct()

dim(dat_merged)
names(dat_merged)
table(dat_merged$integrated_or_annual_report)
table(dat_merged$Sector.x)
table(dat_merged$Sector.y)
table(dat_merged$fls_disclaimer)

dat_merged |>
    select(company, Sector.x, Sector.y) |>
    filter(is.na(Sector.x) & is.na(Sector.y)) |> nrow()

## Prepare variables
dat_merged <- dat_merged |>
    mutate(
        end_target_year = as.integer(end_target_year), 
        interim_target_year = as.integer(interim_target_year),
        zero2050_target = as.integer(end_target == "Net zero" & end_target_year <= 2050),
        full_coverage = as.integer(scope_1 == "Yes" & scope_2 == "Yes" & scope_3 == "Yes"), # excludes "Partial" scope 3 coverage
        carbon_credit_offsets_ct = as.integer(carbon_credit_offsets == "No"),
        has_plan_ct = as.integer(has_plan == "Yes"),
        accountability_ct = as.integer(accountability == "Yes"),
        log_revenue = log(as.numeric(annual_revenue)),
        region_eu = case_when(
        str_detect(country, "AUT|ESP|FRA|HUN|ITA|NLD|POL|PRT") ~ "European Union",
        TRUE ~ geographic_region),
        coal = case_when(Sector.x == "Coal Mining" | Sector.y == "Coal Mining" ~ 1, TRUE ~ 0)
    )

# Rename regions
dat_merged <- dat_merged |>
    mutate(region_eu = recode(region_eu,
        "Latin America and the Caribbean" = "South America",
        "Western and Central Asia" = "Middle East",
        "Europe" = "Other Europe"
    ))

# Inspect country composition of geographic regions
dat_merged |>
    group_by(region_eu) |>
    summarise(countries = paste(unique(country), collapse = ", ")) |>
    arrange(region_eu)

# Assign climate target values (0-7)
dat_merged <- dat_merged |>
    mutate(climate_target = case_when(
        end_target == "No target" ~ 0, # No target
        zero2050_target == 0 ~ 1, # Not net zero by 2050
        zero2050_target == 1 & full_coverage == 0 ~ 2, # Doesn't cover scope 1, 2, and 3 emissions
        zero2050_target == 1 & full_coverage == 1 &
            interim_target == "No target" | interim_target_year >= 2035 ~ 3, # No 5-10 year interim target
        zero2050_target == 1 & full_coverage == 1 &
            interim_target != "No target" & interim_target_year < 2035 &
            carbon_credit_offsets_ct + has_plan_ct + accountability_ct == 0 ~ 4, # None
        zero2050_target == 1 & full_coverage == 1 &
            interim_target != "No target" & interim_target_year < 2035 &
            carbon_credit_offsets_ct + has_plan_ct + accountability_ct == 1 ~ 5, # One true
        zero2050_target == 1 & full_coverage == 1 &
            interim_target != "No target" & interim_target_year < 2035 &
            carbon_credit_offsets_ct + has_plan_ct + accountability_ct == 2 ~ 6, # Two true
        zero2050_target == 1 & full_coverage == 1 &
            interim_target != "No target" & interim_target_year < 2035 &
            carbon_credit_offsets_ct + has_plan_ct + accountability_ct == 3 ~ 7, # Three true
        TRUE ~ 999
    ))

unique(dat_merged$climate_target)
table(dat_merged$climate_target)

# Mean future focus by climate target
dat_merged |>
    group_by(climate_target) |>
    summarise(
        n = n(),
        mean_fls = mean(fls_score, na.rm = TRUE),
        sd_fls = sd(fls_score, na.rm = TRUE)
    ) |>
    arrange(climate_target)

dat_merged |>
    filter(climate_target >= 4) |>
    select(company, country, region_eu, climate_target)

dat_merged |>
    filter(climate_target >= 4) |>
    select(company, region_eu, climate_target) |>
    count(region_eu)

dat_merged |>
    filter(climate_target >= 4) |>
    select(company, country, climate_target) |>
    count(country)

# Alternate climate target values (0-4)
dat_merged <- dat_merged |>
    mutate(climate_target_alt = case_when(
        end_target == "No target" ~ 0, # No target
        zero2050_target == 0 ~ 1, # Not net zero by 2050
        zero2050_target == 1 & full_coverage == 0 ~ 2, # Doesn't cover scope 1, 2, and 3 emissions
        zero2050_target == 1 & full_coverage == 1 &
            interim_target == "No target" | interim_target_year >= 2035 ~ 3, # No 5-10 year interim target
        zero2050_target == 1 & full_coverage == 1 &
            interim_target != "No target" & interim_target_year < 2035 ~ 4, # Has 5-10 year interim target
        TRUE ~ 999
    ))

unique(dat_merged$climate_target_alt)
table(dat_merged$climate_target_alt)

# Create dummy variables for geographic regions
dat_merged <- dat_merged |>
    mutate(
        eu_dummy = as.integer(region_eu == "European Union"),
        e_asia_dummy = as.integer(region_eu == "East Asia"),
        europe_dummy = as.integer(region_eu == "Other Europe"),
        s_am_dummy = as.integer(region_eu == "South America"),
        n_am_dummy = as.integer(region_eu == "North America"),
        oceania_dummy = as.integer(region_eu == "Oceania"),
        s_asia_dummy = as.integer(region_eu == "South Asia"),
        mid_east_dummy = as.integer(region_eu == "Middle East")
    )

names(dat_merged)

## Corpus overview (Table A1)
tab_overview <- dat_merged |>
    mutate(company = case_when(
        company == "Ultrapar Participacoes" ~ "Ultrapar Participações",
        company == "Koc Holding" ~ "Koç Holding",
        TRUE ~ company
    )) |>
    mutate(
        fls_score = round(fls_score, 2), 
        climate_target = as.character(climate_target)
        ) |>
    select(c(
        company, country,
        climate_target, fls_score
    )) |>
    rename(
        `Company Name` = company,
        `Country` = country,
        `Climate Target` = climate_target,
        `Future Focus` = fls_score
    ) |>
    arrange(`Climate Target`, `Future Focus`)

head(tab_overview)
print(xtable(tab_overview, caption = "List of companies represented in the text corpus.",
      label = "tab:corpus_overview", digits = 2),
    file = "corpus_overview.tex",
    type = "latex", booktabs = TRUE,
    caption.placement = "top",
    tabular.environment = "longtable", floating = FALSE
)


## Keyness analysis (Figure 3)
names(dat_fls_llm)

# Create company variable from doc_id
dat_fls_llm <- dat_fls_llm |>
    mutate(company = str_extract(doc_id, "^[^.]+"))

dat_fls_llm |>
    select(company) |>
    distinct()

# Merge report type variable
dat_fls_llm <- dat_fls_llm |>
    left_join(
        dat_report_stats |>
            select(name, integrated_or_annual_report),
        by = c("company" = "name")
    )

dat_fls_llm |>
    group_by(company) |>
    select(company, integrated_or_annual_report) |>
    distinct() |>
    print(n = Inf)

dat_fls_llm <- dat_fls_llm |>
    mutate(integrated_or_annual_report = replace_na(integrated_or_annual_report, 0))

dat_fls_llm |>
    select(company, integrated_or_annual_report) |>
    slice_sample(n = 10)

# get corpus statistics
corp_messages <- corpus(dat_fls_llm)

corp_grouped <- corp_messages |>
    corpus_group(groups = company)

sumcorp <- summary(corp_grouped)

sum(sumcorp$Sentences) / 97
sum(sumcorp$Tokens) / 97

# tokenize corpus
toks <- dat_fls_llm |>
    corpus() |>
    tokens()

# identify multiword expressions (size: 2, 3, 4)
# set seed
set.seed(23)
mwes <- toks |>
    tokens(remove_punct = TRUE) |>
    tokens_keep(min_nchar = 2) |>
    tokens_remove(pattern = stopwords("en"), padding = TRUE) |>
    textstat_collocations(size = 2:4, min_count = 10)

# create a document-feature matrix after compounding mwes
# then group dfm into fls = 1/0
dfmat <- toks |>
    tokens_keep(min_nchar = 2) |>
    tokens_compound(pattern = mwes) |>
    dfm() |>
    dfm_remove(pattern = c(
        "to", "be", "are", "we",
        "that", "of", "than", "our",
        "in", "while", "and", "also"
    )) |>
    dfm_group(groups = fls)

# conduct keyness analysis and give positive values to
# words in the target category = 1
tstat_key <- textstat_keyness(dfmat, target = "1")

# plot top 20 words
textplot_keyness(
    tstat_key,
    n = 20,
    labelsize = 5,
    labelcolor = "grey20"
) +
    xlab("Chi Square Association") +
    theme_tufte(base_size = 18) +
    theme(
        panel.border = element_rect(
            fill = NA, colour = "black", linewidth = 0.5,
            linetype = "solid"
        ),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.title = element_blank(),
        legend.position = "bottom",
        legend.text = element_text(size = 18)
    ) +
    scale_color_manual(
        values = c("grey20", "grey80"),
        labels = c("Forward-looking", "Not forward-looking")
    )

# save plot
ggsave("keyness.pdf", width = 9, height = 10)


# Plot most frequent terms (Figure A1)
dfmat |>
    dfm_remove(pattern = stopwords("en")) |>
    textstat_frequency(groups = fls, n = 25) |>
    ggplot(aes(
        x = frequency,
        y = reorder(feature, frequency),
        color = group
    )) +
    geom_point(
        shape = 21,
        color = "grey20",
        size = 3,
        aes(fill = factor(group))
    ) +
    labs(x = "Frequency", y = NULL) +
    theme_tufte(base_size = 18) +
    scale_fill_manual(
        values = c("white", "grey20"),
        labels = c("Not forward-looking", "Forward-looking")
    ) +
    theme(
        panel.border = element_rect(
            fill = NA, colour = "black", linewidth = 0.5,
            linetype = "solid"
        ),
        legend.title = element_blank(),
        legend.position = "bottom"
    )

# save plot
ggsave("frequency.pdf", width = 9, height = 9)


# tokenize corpus after filtering for FLS = 1
toks_fls <- dat_fls_llm |>
    filter(fls == 1) |>
    corpus() |>
    tokens()

# identify multiword expressions (size: 2, 3, 4)
# set seed
set.seed(23)
mwes_fls <- toks_fls |>
    tokens(remove_punct = TRUE) |>
    tokens_keep(min_nchar = 2) |>
    tokens_remove(pattern = stopwords("en"), padding = TRUE) |>
    textstat_collocations(size = 2:4, min_count = 10)

# List company names
companies <- c(
    tolower(unique(dat_fls_llm$company)) |>
        str_replace_all(" ", "_"),
    c("pttep", "hengli", "tatneft’s", "halliburton",
    "cosmo_energy", "cosmo_energy_group", "agl", "agl’s",
    "jean-pierre", "clamadieu")
)
companies

# Compare report types
dfmat_report_type <- toks_fls |>
    tokens_keep(min_nchar = 2) |>
    tokens_compound(pattern = mwes) |>
    dfm() |>
    dfm_remove(pattern = c(
        "to", "be", "are", "we",
        "that", "of", "than", "our",
        "in", "while", "and", "also",
        "thailand", "tokyo", "spain", "europe", "european",
        "dx", "gx", "it", "which", "gw", "co",
        "is", "so", "the", "mtmp", "seventh", 
        companies
    )) |>
    dfm_group(groups = integrated_or_annual_report)

tstat_key_report_type <- textstat_keyness(dfmat_report_type, target = "1")

# List top 50 keywords for report type comparison
top_50_target <- tstat_key_report_type[1:50, ]
print(top_50_target)

top_50_reference <- tstat_key_report_type |>
    arrange(chi2) |>
    slice(1:50)
print(top_50_reference)

tab_report_type_features <- top_50_target |>
    select(feature, chi2) |>
    bind_cols(top_50_reference |> select(feature, chi2))
tab_report_type_features

# Save table as LaTeX (Table A6)
print(
    xtable(tab_report_type_features,
        caption = "Top 50 predictive terms for integrated vs. sustainability reports",
        label = "tab:report_type_features"
    ),
    file = "report_type_features.tex",
    include.rownames = FALSE,
    caption.placement = "top",
    booktabs = TRUE,
    table.placement = "h!"
)

# Mean fls_score by report type
dat_fls_llm |>
    group_by(integrated_or_annual_report) |>
    summarise(mean_fls = mean(fls))

# Predictive value of forward-looking terms extracted from FLS disclaimers
dat_fls_terms <- import("fls_terms_all.csv") |>
    select(forward_looking_statements) |>
    rename(terms = forward_looking_statements) |>
    filter(terms != "") |>
    mutate(terms = str_remove_all(terms, "[\"\\'‘’“”\n]")) |>
    summarise(terms = paste(terms, collapse = ", "))

dat_fls_terms
dat_fls_terms <- dat_fls_terms |>
    separate_rows(terms, sep = "[,;]") |>
    mutate(terms = str_trim(terms)) |>
    filter(terms != "")

# Get term frequencies in FLS disclaimers
dat_fls_terms |>
    count(terms, sort = TRUE) |>
    rename(
        Term = terms,
        Frequency = n
    ) |>
    print(n = Inf)

class(mwes)
head(mwes)

dat_fls_mwe <- dat_fls_terms |>
    select(terms) |>
    filter(str_detect(terms, " ")) |>
    as.list()
class(dat_fls_mwe)
dat_fls_mwe

dfmat_fls_mwe <- toks |>
    tokens_keep(min_nchar = 2) |>
    tokens_compound(pattern = dat_fls_mwe) |>
    dfm() |>
    dfm_remove(pattern = c(
        "to", "be", "are", "we",
        "that", "of", "than", "our",
        "in", "while", "and", "also"
    )) |>
    dfm_group(groups = fls)

# Get chi2 for terms in FLS disclaimers
tstat_key_fls_terms <- textstat_keyness(dfmat_fls_mwe, target = "1")
names(tstat_key_fls_terms)
tstat_key_fls_terms |>
    filter(feature == "looking_forward")

dat_fls_terms <- dat_fls_terms |>
    mutate(terms_concat = str_replace_all(terms, " ", "_"))
dat_fls_terms

keyness_fls_terms <- tstat_key_fls_terms |>
    filter(feature %in% dat_fls_terms$terms_concat) |>
    arrange(desc(chi2)) |>
    mutate(
        n_target = as.integer(n_target),
        n_reference = as.integer(n_reference)
    )

names(keyness_fls_terms)

# Save table as LaTeX (Table A5)
print(
    xtable(keyness_fls_terms |> filter(chi2 > 5) |> select(-p),
        caption = "Predictive value of forward-looking terms",
        label = "tab:fls_terms_chi2"
    ),
    file = "fls_terms_chi2.tex",
    include.rownames = FALSE,
    caption.placement = "top",
    booktabs = TRUE,
    table.placement = "h!"
)


## Future focus by geographic region (Figure 4)
# calculate the mean score for each region
region_means <- dat_merged |>
    group_by(region_eu) |>
    summarise(mean_fls = mean(fls_score))

print(region_means)

# create a scatter plot of FLS scores with panels for each region
ggplot(dat_merged, aes(x = company, y = fls_score)) +
    geom_hline(
        data = region_means,
        aes(yintercept = mean_fls),
        linetype = "dashed"
    ) +
    geom_point(color = "grey20", size = 2) +
    facet_wrap(vars(region_eu)) +
    labs(
        y = "Future Focus", 
        x = NULL
    ) +
    theme_tufte(base_size = 20) +
    theme(
        panel.border = element_rect(
            fill = NA, colour = "black", linewidth = 0.5,
            linetype = "solid"
        ),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title = element_text(size = 18)
    )

ggsave("fls_scores_by_region.pdf", width = 9, height = 7)


## Mean future focus and standard deviation by region (Figure A2)

df_mean_std <- dat_merged |>
    group_by(region_eu) |>
    summarise_at(vars(fls_score), list(mean = mean, sd = sd)) |>
    as.data.frame()

df_mean_std

ggplot(df_mean_std, aes(x = mean, y = region_eu)) +
    geom_errorbar(aes(xmin = mean - sd, xmax = mean + sd), width = .3) +
    geom_point(size = 3, color = "grey20") +
    labs(
        x = NULL,
        y = NULL
    ) +
    theme_tufte(base_size = 20) +
    theme(
        panel.border = element_rect(
            fill = NA, colour = "black", linewidth = 0.5,
            linetype = "solid"
        )
    )

ggsave("fls_scores_mean_sd.pdf", width = 9, height = 7)

## Future focus by climate target (Figure 5)
ggplot(dat_merged, aes(x = as.factor(climate_target), y = fls_score)) +
    geom_boxplot(outlier.shape = NA) +
    geom_point(size = 3.5, alpha = 0.5) +
    labs(
        x = "Climate Target",
        y = "Future Focus"
    ) +
    scale_x_discrete(
        limits = as.character(0:7),
        labels = c(
            "0" = "0\nNo target",
            "1" = "1\nLess\nsubstantive",
            "2" = "2",
            "3" = "3",
            "4" = "4",
            "5" = "5",
            "6" = "6",
            "7" = "7\nMore\nsubstantive"
        )
    ) +
    theme_tufte(base_size = 18) +
    theme(
        panel.border = element_blank(),
        axis.line = element_line(color = "black", size = 0.5),
        plot.margin = margin(5.5, 60, 5.5, 5.5),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18)
    ) +
    geom_text(
        data = dat_merged |>
            filter(fls_score > quantile(fls_score, 0.95, na.rm = TRUE) |
                fls_score < quantile(fls_score, 0.05, na.rm = TRUE)),
        aes(label = company),
        size = 6,
        hjust = -0.07,
        vjust = -0.15,
        check_overlap = TRUE
    ) +
    coord_cartesian(clip = "off")

ggsave("fw_matrix_plot.pdf", width = 10, height = 7)

## Run regressions (Table A7)
model1 <- lm(fls_score ~ climate_target, data = dat_merged)

model2 <- lm(
    fls_score ~ climate_target + log_revenue,
    data = dat_merged
)

# North America as reference category
model3 <- lm(
    fls_score ~ climate_target + log_revenue + 
        e_asia_dummy + europe_dummy + s_am_dummy + 
        eu_dummy + oceania_dummy + s_asia_dummy + mid_east_dummy,
    data = dat_merged
)

model4 <- lm(
    fls_score ~ climate_target + log_revenue +
        e_asia_dummy + europe_dummy + s_am_dummy +
        eu_dummy + oceania_dummy + s_asia_dummy + mid_east_dummy +
        coal,
    data = dat_merged
)

dat_merged |>
    select(company, fls_disclaimer)

model5 <- lm(
    fls_score ~ climate_target + log_revenue +
        e_asia_dummy + europe_dummy + s_am_dummy +
        eu_dummy + oceania_dummy + s_asia_dummy + mid_east_dummy +
        coal + fls_disclaimer,
    data = dat_merged
)

screenreg(list(model1, model2, model3, model4, model5), digits = 3, stars = c(0.1, 0.05, 0.01))

texreg(list(model1, model2, model3, model4, model5),
    digits = 3, stars = c(0.1, 0.05, 0.01),
    custom.coef.names = c(
        "(Intercept)",
        "Climate Target",
        "Revenue",
        "East Asia", "Other Europe", "South America", "European Union",
        "Oceania", "South Asia", "Middle East", "Coal", "Disclaimer"
    ),
    caption = "Explanatory factors for variation in future focus",
    caption.above = TRUE,
    label = "tab:regressions_future_focus",
    file = "regressions_future_focus.tex",
    use.packages = FALSE,
    booktabs = TRUE
)